Create the InferCNV Object

Reading in the raw counts matrix and meta data, populating the infercnv object

infercnv_obj = CreateInfercnvObject(
  raw_counts_matrix="oligodendroglioma_expression_downsampled.counts.matrix",
  annotations_file="oligodendroglioma_annotations_downsampled.txt",
  delim="\t",
  gene_order_file="gencode_downsampled.txt",
  ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)"))
<<<<<<< HEAD
## INFO [2018-10-22 13:10:54] ::order_reduce:Start.
## INFO [2018-10-22 13:10:54] .order_reduce(): expr and order match.
## INFO [2018-10-22 13:10:54] ::process_data:order_reduce:Reduction from positional data, new dimensions (r,c) = 10338,184 Total=18322440.6799817 Min=0 Max=34215.
## INFO [2018-10-22 13:10:54] validating infercnv_obj
=======
## INFO [2018-10-31 13:35:02] ::order_reduce:Start.
## INFO [2018-10-31 13:35:02] .order_reduce(): expr and order match.
## INFO [2018-10-31 13:35:02] ::process_data:order_reduce:Reduction from positional data, new dimensions (r,c) = 10338,184 Total=18322440.6799817 Min=0 Max=34215.
## INFO [2018-10-31 13:35:02] validating infercnv_obj
>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308

Filtering genes

Removing those genes that are very lowly expressed or present in very few cells

# filter out low expressed genes
cutoff=1
infercnv_obj <- require_above_min_mean_expr_cutoff(infercnv_obj, cutoff)
<<<<<<< HEAD
## INFO [2018-10-22 13:10:54] ::above_min_mean_expr_cutoff:Start
## INFO [2018-10-22 13:10:55] Removing 3025 genes from matrix as below mean expr threshold: 1
## INFO [2018-10-22 13:10:55] validating infercnv_obj
## INFO [2018-10-22 13:10:55] There are 7313 genes and 184 cells remaining in the expr matrix.
# filter out bad cells
min_cells_per_gene=3
infercnv_obj <- require_above_min_cells_ref(infercnv_obj, min_cells_per_gene=min_cells_per_gene)
## INFO [2018-10-22 13:10:55] Removed 117 genes having fewer than 3 min cells per gene = 1.59989 % genes removed here
## INFO [2018-10-22 13:10:55] validating infercnv_obj
=======
## INFO [2018-10-31 13:35:02] ::above_min_mean_expr_cutoff:Start
## INFO [2018-10-31 13:35:02] Removing 3025 genes from matrix as below mean expr threshold: 1
## INFO [2018-10-31 13:35:02] validating infercnv_obj
## INFO [2018-10-31 13:35:02] There are 7313 genes and 184 cells remaining in the expr matrix.
# filter out bad cells
min_cells_per_gene=3
infercnv_obj <- require_above_min_cells_ref(infercnv_obj, min_cells_per_gene=min_cells_per_gene)
## INFO [2018-10-31 13:35:02] Removed 117 genes having fewer than 3 min cells per gene = 1.59989 % genes removed here
## INFO [2018-10-31 13:35:02] validating infercnv_obj
>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308
## for safe keeping
infercnv_orig_filtered = infercnv_obj
#plot_mean_chr_expr_lineplot(infercnv_obj)
save('infercnv_obj', file = 'infercnv_obj.orig_filtered')

Normalize each cell’s counts for sequencing depth

Perform a total sum normalization. Generates counts-per-million or counts-per-100k, depending on the overall sequencing depth.

infercnv_obj <- infercnv:::normalize_counts_by_seq_depth(infercnv_obj)
<<<<<<< HEAD
## INFO [2018-10-22 13:10:55] Computed total sum normalization factor as: 100000.000000

perform Anscombe normalization

Suggested for removing noisy variation at low counts

=======
## INFO [2018-10-31 13:35:03] Computed total sum normalization factor as: 100000.000000

Spike in artificial variation for tracking purposes

Add ~0x and 2x variation to an artificial spike-in data set based on the normal cells so we can track and later scale residual expression data to this level of variation.

infercnv_obj <- spike_in_variation_chrs(infercnv_obj)
## INFO [2018-10-31 13:35:03] Selecting longest chrs for adding spike: chr1,chr2
## INFO [2018-10-31 13:35:03] processing group: malignant_MGH36
## INFO [2018-10-31 13:35:03] processing group: malignant_MGH53
## INFO [2018-10-31 13:35:03] processing group: malignant_93
## INFO [2018-10-31 13:35:03] processing group: malignant_97
## INFO [2018-10-31 13:35:04] processing group: Microglia/Macrophage
## INFO [2018-10-31 13:35:04] processing group: Oligodendrocytes (non-malignant)
## INFO [2018-10-31 13:35:22] validating infercnv_obj

perform Anscombe normalization

Useful noise reduction method.
See: https://en.wikipedia.org/wiki/Anscombe_transform

>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308
infercnv_obj <- infercnv:::anscombe_transform(infercnv_obj)
save('infercnv_obj', file='infercnv_obj.anscombe')

log transform the normalized counts:

infercnv_obj <- log2xplus1(infercnv_obj)
<<<<<<< HEAD
## INFO [2018-10-22 13:10:56] transforming log2xplus1()
=======
## INFO [2018-10-31 13:35:23] transforming log2xplus1()
>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308
save('infercnv_obj', file='infercnv_obj.log_transformed')

Apply maximum bounds to the expression data to reduce outlier effects

Here we define a threshold by taking the mean of the bounds of expression data across all cells. This is then use to define a cap for the bounds of all data.

threshold = mean(abs(get_average_bounds(infercnv_obj))) 
infercnv_obj <- apply_max_threshold_bounds(infercnv_obj, threshold=threshold)
<<<<<<< HEAD
## INFO [2018-10-22 13:10:57] ::process_data:setting max centered expr, threshold set to: +/-:  4.33717214157707
=======
## INFO [2018-10-31 13:35:24] ::process_data:setting max centered expr, threshold set to: +/-:  4.29366139176392
>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308

Initial view, before inferCNV operations:

plot_cnv(infercnv_obj, 
         output_filename='infercnv.logtransf', 
         x.range="auto", 
         title = "Before InferCNV (filtered & log2 transformed)", 
         color_safe_pal = FALSE, 
         x.center = mean(infercnv_obj@expr.data))
knitr::include_graphics("infercnv.logtransf.png")
<<<<<<< HEAD

=======

>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308

perform smoothing across chromosomes

The expression values are

infercnv_obj = smooth_by_chromosome(infercnv_obj, window_length=101, smooth_ends=TRUE)
<<<<<<< HEAD
## INFO [2018-10-22 13:11:14] ::smooth_window:Start.
## INFO [2018-10-22 13:11:14] ::smooth_window:Start.
## INFO [2018-10-22 13:11:15] ::smooth_window:Start.
## INFO [2018-10-22 13:11:15] ::smooth_window:Start.
## INFO [2018-10-22 13:11:15] ::smooth_window:Start.
## INFO [2018-10-22 13:11:16] ::smooth_window:Start.
## INFO [2018-10-22 13:11:16] ::smooth_window:Start.
## INFO [2018-10-22 13:11:16] ::smooth_window:Start.
## INFO [2018-10-22 13:11:17] ::smooth_window:Start.
## INFO [2018-10-22 13:11:17] ::smooth_window:Start.
## INFO [2018-10-22 13:11:17] ::smooth_window:Start.
## INFO [2018-10-22 13:11:17] ::smooth_window:Start.
## INFO [2018-10-22 13:11:18] ::smooth_window:Start.
## INFO [2018-10-22 13:11:18] ::smooth_window:Start.
## INFO [2018-10-22 13:11:18] ::smooth_window:Start.
## INFO [2018-10-22 13:11:18] ::smooth_window:Start.
## INFO [2018-10-22 13:11:19] ::smooth_window:Start.
## INFO [2018-10-22 13:11:19] ::smooth_window:Start.
## WARN [2018-10-22 13:11:19] window length exceeds number of rows in data
## WARN [2018-10-22 13:11:19] setting window length to nrows
## INFO [2018-10-22 13:11:19] ::smooth_window:Start.
## INFO [2018-10-22 13:11:20] ::smooth_window:Start.
## INFO [2018-10-22 13:11:20] ::smooth_window:Start.
## WARN [2018-10-22 13:11:20] window length exceeds number of rows in data
## WARN [2018-10-22 13:11:20] setting window length to nrows
## INFO [2018-10-22 13:11:20] ::smooth_window:Start.
## INFO [2018-10-22 13:11:20] ::smooth_window:Start.
## INFO [2018-10-22 13:11:20] ::smooth_window:Start.
## WARN [2018-10-22 13:11:20] window length exceeds number of rows in data
## WARN [2018-10-22 13:11:20] setting window length to nrows
=======
## INFO [2018-10-31 13:35:48] ::smooth_window:Start.
## INFO [2018-10-31 13:35:48] ::smooth_window:Start.
## INFO [2018-10-31 13:35:49] ::smooth_window:Start.
## INFO [2018-10-31 13:35:49] ::smooth_window:Start.
## INFO [2018-10-31 13:35:49] ::smooth_window:Start.
## INFO [2018-10-31 13:35:49] ::smooth_window:Start.
## INFO [2018-10-31 13:35:50] ::smooth_window:Start.
## INFO [2018-10-31 13:35:50] ::smooth_window:Start.
## INFO [2018-10-31 13:35:50] ::smooth_window:Start.
## INFO [2018-10-31 13:35:51] ::smooth_window:Start.
## INFO [2018-10-31 13:35:51] ::smooth_window:Start.
## INFO [2018-10-31 13:35:51] ::smooth_window:Start.
## INFO [2018-10-31 13:35:51] ::smooth_window:Start.
## INFO [2018-10-31 13:35:52] ::smooth_window:Start.
## INFO [2018-10-31 13:35:52] ::smooth_window:Start.
## INFO [2018-10-31 13:35:52] ::smooth_window:Start.
## INFO [2018-10-31 13:35:53] ::smooth_window:Start.
## INFO [2018-10-31 13:35:53] ::smooth_window:Start.
## INFO [2018-10-31 13:35:53] ::smooth_window:Start.
## INFO [2018-10-31 13:35:53] ::smooth_window:Start.
## INFO [2018-10-31 13:35:54] ::smooth_window:Start.
## INFO [2018-10-31 13:35:54] ::smooth_window:Start.
## INFO [2018-10-31 13:35:54] ::smooth_window:Start.
## INFO [2018-10-31 13:35:54] ::smooth_window:Start.
>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308
save('infercnv_obj', file='infercnv_obj.smooth_by_chr')

# re-center each cell
infercnv_obj <- center_cell_expr_across_chromosome(infercnv_obj, method = "median")
<<<<<<< HEAD
## INFO [2018-10-22 13:11:21] ::center_smooth across chromosomes per cell
=======
## INFO [2018-10-31 13:35:55] ::center_smooth across chromosomes per cell
>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308
save('infercnv_obj', file='infercnv_obj.cells_recentered')
plot_cnv(infercnv_obj, 
         output_filename='infercnv.chr_smoothed', 
         x.range="auto", 
         title = "chr smoothed and cells re-centered", 
         color_safe_pal = FALSE)
knitr::include_graphics("infercnv.chr_smoothed.png")
<<<<<<< HEAD

=======

>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308

subtract the reference values from observations, now have log(fold change) values

infercnv_obj <- subtract_ref_expr_from_obs(infercnv_obj, inv_log=TRUE)
<<<<<<< HEAD
## INFO [2018-10-22 13:11:40] ::subtract_ref_expr_from_obs:Start
## INFO [2018-10-22 13:11:41] subtracting mean(normal) per gene per cell across all data
=======
## INFO [2018-10-31 13:36:21] ::subtract_ref_expr_from_obs:Start
## INFO [2018-10-31 13:36:21] subtracting mean(normal) per gene per cell across all data
>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308
save('infercnv_obj', file='infercnv_obj.ref_subtracted')
plot_cnv(infercnv_obj, 
         output_filename='infercnv.ref_subtracted', 
         x.range="auto", 
         title="ref subtracted", 
         color_safe_pal = FALSE)
knitr::include_graphics("infercnv.ref_subtracted.png")
<<<<<<< HEAD

=======

>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308

invert log values

Converting the log(FC) values to regular fold change values, centered at 1 (no fold change)

This is important because we want (1/2)x to be symmetrical to 1.5x, representing loss/gain of one chromosome region.

infercnv_obj <- invert_log2(infercnv_obj)
<<<<<<< HEAD
## INFO [2018-10-22 13:12:00] invert_log2(), computing 2^x
=======
## INFO [2018-10-31 13:36:46] invert_log2(), computing 2^x
>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308
save('infercnv_obj', file='infercnv_obj.inverted_log')
plot_cnv(infercnv_obj, 
         output_filename='infercnv.inverted', 
         color_safe_pal = FALSE, 
         x.range="auto", 
         x.center=1, 
         title = "inverted log FC to FC")
knitr::include_graphics("infercnv.inverted.png")
<<<<<<< HEAD

Removing noise

infercnv_obj <- clear_noise_via_ref_mean_sd(infercnv_obj, sd_amplifier = 1.0)
## INFO [2018-10-22 13:12:16] :: **** clear_noise_via_ref_quantiles **** : removing noise between bounds:  0.915981505347197 - 1.08562773127562
=======

Removing noise

infercnv_obj <- clear_noise_via_ref_mean_sd(infercnv_obj, sd_amplifier = 1.5)
## INFO [2018-10-31 13:37:10] :: **** clear_noise_via_ref_quantiles **** : removing noise between bounds:  0.917085999266254 - 1.08405842882653
>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308
save('infercnv_obj', file='infercnv_obj.denoised')
plot_cnv(infercnv_obj, 
         output_filename='infercnv.denoised', 
         x.range="auto", 
         x.center=1, 
         title="denoised", 
         color_safe_pal = FALSE)
knitr::include_graphics("infercnv.denoised.png")
<<<<<<< HEAD

=======

>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308

Remove outlier data points

This generally improves on the visualization

infercnv_obj = remove_outliers_norm(infercnv_obj)
<<<<<<< HEAD
## INFO [2018-10-22 13:12:32] ::remove_outlier_norm:Start out_method: average_bound lower_bound: NA upper_bound: NA
## INFO [2018-10-22 13:12:32] ::remove_outlier_norm:Start out_method: average_bound lower_bound: NA upper_bound: NA
## INFO [2018-10-22 13:12:32] ::remove_outlier_norm using method: average_bound for defining outliers.
## INFO [2018-10-22 13:12:32] outlier bounds defined between: 0.353023 - 3.81125
=======
## INFO [2018-10-31 13:37:32] ::remove_outlier_norm:Start out_method: average_bound lower_bound: NA upper_bound: NA
## INFO [2018-10-31 13:37:32] ::remove_outlier_norm:Start out_method: average_bound lower_bound: NA upper_bound: NA
## INFO [2018-10-31 13:37:32] ::remove_outlier_norm using method: average_bound for defining outliers.
## INFO [2018-10-31 13:37:32] outlier bounds defined between: 0.70462 - 1.35119
>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308
save('infercnv_obj', file="infercnv_obj.outliers_removed")
plot_cnv(infercnv_obj, 
         output_filename='infercnv.outliers_removed', 
         color_safe_pal = FALSE, 
         x.range="auto", 
         x.center=1, 
         title = "outliers removed")
knitr::include_graphics("infercnv.outliers_removed.png")
<<<<<<< HEAD

Find DE genes by comparing the mutant types to normal types, BASIC

Runs a t-Test comparing tumor/normal for each patient and normal sample, and masks out those genes that are not significantly DE.

plot_data = infercnv_obj@expr.data
high_threshold = max(abs(quantile(plot_data[plot_data != 0], c(0.05, 0.95))))  

low_threshold = -1 * high_threshold 

infercnv_obj <- infercnv:::mask_non_DE_genes_basic(infercnv_obj, test.use = 't', center_val=1)
## INFO [2018-10-22 13:12:47] Finding DE genes between malignant_MGH36 and Microglia/Macrophage
## INFO [2018-10-22 13:12:48] Found 3865 genes as DE
## INFO [2018-10-22 13:12:48] Finding DE genes between malignant_MGH36 and Oligodendrocytes (non-malignant)
## INFO [2018-10-22 13:12:49] Found 3485 genes as DE
## INFO [2018-10-22 13:12:49] Finding DE genes between malignant_MGH53 and Microglia/Macrophage
## INFO [2018-10-22 13:12:50] Found 2943 genes as DE
## INFO [2018-10-22 13:12:50] Finding DE genes between malignant_MGH53 and Oligodendrocytes (non-malignant)
## INFO [2018-10-22 13:12:51] Found 2554 genes as DE
## INFO [2018-10-22 13:12:51] Finding DE genes between malignant_93 and Microglia/Macrophage
## INFO [2018-10-22 13:12:52] Found 3146 genes as DE
## INFO [2018-10-22 13:12:52] Finding DE genes between malignant_93 and Oligodendrocytes (non-malignant)
## INFO [2018-10-22 13:12:53] Found 2576 genes as DE
## INFO [2018-10-22 13:12:53] Finding DE genes between malignant_97 and Microglia/Macrophage
## INFO [2018-10-22 13:12:55] Found 3237 genes as DE
## INFO [2018-10-22 13:12:55] Finding DE genes between malignant_97 and Oligodendrocytes (non-malignant)
## INFO [2018-10-22 13:12:56] Found 3027 genes as DE
save('infercnv_obj', file="infercnv_obj.non_DE_masked")
=======

Scale residual expression values according to the Spike-in

Perform rescaling of the data according to the spike-in w/ preset variation levels. Then, remove the spike-in data.

# rescale
infercnv_obj <- scale_cnv_by_spike(infercnv_obj)
# remove the spike-in
infercnv_obj <- remove_spike(infercnv_obj)
## INFO [2018-10-31 13:37:58] Removing spike

Mask out those genes that are not signficantly different from the normal cells

Runs a Wilcoxon rank test comparing tumor/normal for each patient and normal sample, and masks out those genes that are not significantly DE.

## INFO [2018-10-31 13:37:58] Finding DE genes between malignant_MGH36 and Microglia/Macrophage
## INFO [2018-10-31 13:38:01] Found 3349 genes / 7196 total as DE
## INFO [2018-10-31 13:38:01] Finding DE genes between malignant_MGH36 and Oligodendrocytes (non-malignant)
## INFO [2018-10-31 13:38:03] Found 2835 genes / 7196 total as DE
## INFO [2018-10-31 13:38:03] Finding DE genes between malignant_MGH53 and Microglia/Macrophage
## INFO [2018-10-31 13:38:06] Found 2077 genes / 7196 total as DE
## INFO [2018-10-31 13:38:06] Finding DE genes between malignant_MGH53 and Oligodendrocytes (non-malignant)
## INFO [2018-10-31 13:38:09] Found 1710 genes / 7196 total as DE
## INFO [2018-10-31 13:38:09] Finding DE genes between malignant_93 and Microglia/Macrophage
## INFO [2018-10-31 13:38:11] Found 2304 genes / 7196 total as DE
## INFO [2018-10-31 13:38:11] Finding DE genes between malignant_93 and Oligodendrocytes (non-malignant)
## INFO [2018-10-31 13:38:14] Found 1816 genes / 7196 total as DE
## INFO [2018-10-31 13:38:14] Finding DE genes between malignant_97 and Microglia/Macrophage
## INFO [2018-10-31 13:38:17] Found 2774 genes / 7196 total as DE
## INFO [2018-10-31 13:38:17] Finding DE genes between malignant_97 and Oligodendrocytes (non-malignant)
## INFO [2018-10-31 13:38:19] Found 2580 genes / 7196 total as DE
>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308
plot_cnv(infercnv_obj, 
         output_filename='infercnv.non-DE-genes-masked', 
         color_safe_pal = FALSE, 
         x.range=c(0,2), # want 0-2 post scaling by the spike-in 
         x.center=1, 
         title = "non-DE-genes-masked")
knitr::include_graphics("infercnv.non-DE-genes-masked.png")
<<<<<<< HEAD

Brighten it up by changing the scale threshold to our liking:

plot_cnv(infercnv_obj, 
         output_filename='infercnv.finalized_view', 
         color_safe_pal = FALSE, 
         x.range=c(0.7, 1.3), 
         x.center=1, 
         title = "InferCNV")
## INFO [2018-10-22 13:13:11] ::plot_cnv:Start
## INFO [2018-10-22 13:13:11] ::plot_cnv:Current data dimensions (r,c)=7196,184 Total=1322070.80902723 Min=0.353023266316393 Max=3.81124796173442.
## INFO [2018-10-22 13:13:11] ::plot_cnv:Depending on the size of the matrix this may take a moment.
## INFO [2018-10-22 13:13:11] plot_cnv_observation:Start
## INFO [2018-10-22 13:13:11] Observation data size: Cells= 142 Genes= 7196
## INFO [2018-10-22 13:13:11] clustering observations via method: ward.D
## INFO [2018-10-22 13:13:11] Number of genes in group(1) is 33
## INFO [2018-10-22 13:13:11] group size being clustered:  33,7196
## INFO [2018-10-22 13:13:11] Number of genes in group(2) is 34
## INFO [2018-10-22 13:13:11] group size being clustered:  34,7196
## INFO [2018-10-22 13:13:11] Number of genes in group(3) is 40
## INFO [2018-10-22 13:13:11] group size being clustered:  40,7196
## INFO [2018-10-22 13:13:11] Number of genes in group(4) is 35
## INFO [2018-10-22 13:13:11] group size being clustered:  35,7196
## INFO [2018-10-22 13:13:11] plot_cnv_observation:Writing observation groupings/color.
## INFO [2018-10-22 13:13:20] Colors for breaks:  #00008B,#24249B,#4848AB,#6D6DBC,#9191CC,#B6B6DD,#DADAEE,#FFFFFF,#EEDADA,#DDB6B6,#CC9191,#BC6D6D,#AB4848,#9B2424,#8B0000
## INFO [2018-10-22 13:13:20] Quantiles of plotted data range: 0.7,1,1,1.00080461831141,1.3
## INFO [2018-10-22 13:13:21] plot_cnv_references:Writing observation data to ./observations.txt
## INFO [2018-10-22 13:13:21] plot_cnv_references:Start
## INFO [2018-10-22 13:13:21] Reference data size: Cells= 42 Genes= 7196
## INFO [2018-10-22 13:13:21] plot_cnv_references:Number reference groups= 2
## INFO [2018-10-22 13:13:21] plot_cnv_references:Plotting heatmap.
## INFO [2018-10-22 13:13:24] Colors for breaks:  #00008B,#24249B,#4848AB,#6D6DBC,#9191CC,#B6B6DD,#DADAEE,#FFFFFF,#EEDADA,#DDB6B6,#CC9191,#BC6D6D,#AB4848,#9B2424,#8B0000
## INFO [2018-10-22 13:13:24] Quantiles of plotted data range: 0.7,1.00080461831141,1.00080461831141,1.00080461831141,1.3
## INFO [2018-10-22 13:13:24] plot_cnv_references:Writing reference data to ./references.txt
## quartz_off_screen 
##                 2
knitr::include_graphics("infercnv.finalized_view.png")

=======

And that’s it. You can experiment with each step to fine-tune your data exploration. See the documentation for uploading the resulting data matrix into the Next Generation Clustered Heatmap Viewer for more interactive exploration of the infercnv-processed data: https://github.com/broadinstitute/inferCNV/wiki/Next-Generation-Clustered-Heat-Map

>>>>>>> 29a0b973d2701fe5ea2834efcd6a82dd542e0308